Pima Indian dataset

We will make use of the Pima Indian dataset again, as in the previous Markdown document. Importantly, note that in this setting we are not standardizing the predictors. This usually leads to additional difficulties, as the scales of the predictors vary.

# Pool the MASS training and test splits into a single data set
Pima <- rbind(MASS::Pima.tr, MASS::Pima.te)
y <- ifelse(Pima$type == "Yes", 1, 0) # Binary outcome (1 = diabetic)
X <- cbind(1, model.matrix(type ~ . - 1, data = Pima)) # Design matrix with explicit intercept column

As before, we will employ a relatively vague prior centered at \(0\), namely \(\beta \sim N(0, 100 I_p)\). Then, we implement the log-likelihood, the log-posterior and the gradient of the log-posterior.

# Log-likelihood of a logistic regression model.
#
# beta: coefficient vector (length p)
# y:    binary response vector of 0/1 values
# X:    n x p design matrix
#
# Returns sum_i [ y_i * eta_i - log(1 + exp(eta_i)) ]. The term
# log(1 + exp(eta)) is evaluated via the numerically stable identity
# log(1 + exp(eta)) = max(eta, 0) + log1p(exp(-|eta|)), which avoids
# overflow to Inf (and hence a -Inf log-likelihood) for large |eta| —
# important here since the predictors are not standardized.
loglik <- function(beta, y, X) {
  eta <- c(X %*% beta)
  sum(y * eta - (pmax(eta, 0) + log1p(exp(-abs(eta)))))
}

# Log-posterior: log-likelihood plus an independent N(0, 100) prior on each
# coefficient, i.e. beta ~ N(0, 100 I_p) so the prior standard deviation is 10.
logpost <- function(beta, y, X) {
  # TRUE spelled out (T is a reassignable binding and unsafe in scripts)
  loglik(beta, y, X) + sum(dnorm(beta, 0, 10, log = TRUE))
}

# Gradient of the log-posterior of the logistic model:
# X'(y - p(beta)) from the likelihood plus -beta / 100 from the
# N(0, 100 I_p) prior (prior precision 1/100 per coefficient).
lgradient <- function(beta, y, X) {
  fitted_probs <- plogis(as.vector(X %*% beta))
  score_resid <- y - fitted_probs
  as.vector(t(X) %*% score_resid) - beta / 100
}

# Summary table comparing the 6 considered samplers on timing, mixing
# (effective sample size) and acceptance rate
method_names <- c("MH Laplace + Rcpp", "MALA", "MALA tuned", "HMC", "STAN", "Pólya-Gamma")
metric_names <- c("Seconds", "Average ESS", "Average ESS per sec", "Average acceptance rate")
summary_tab <- matrix(0,
  nrow = length(method_names), ncol = length(metric_names),
  dimnames = list(method_names, metric_names)
)

Metropolis Hastings (Laplace) and Rcpp

We first consider a random walk Metropolis-Hastings algorithm whose proposal covariance is based on a Laplace approximation of the posterior, with the sampler itself implemented in Rcpp.

library(coda)
R <- 30000 # Number of retained MCMC samples
burn_in <- 5000 # Number of discarded warm-up iterations
set.seed(123)

# Covariance matrix is selected via Laplace approximation: vcov() of the
# maximum-likelihood fit approximates the posterior covariance, and the
# 2.38^2 / p factor is the classical optimal random-walk MH scaling
fit_logit <- glm(y ~ X - 1, family = binomial(link = "logit"))
p <- ncol(X)
S <- 2.38^2 * vcov(fit_logit) / p

# Running the MCMC (RMH_arma is the Rcpp/Armadillo random-walk MH sampler,
# defined elsewhere — not shown in this document)
start.time <- Sys.time()
# MCMC
fit_MCMC <- as.mcmc(RMH_arma(R, burn_in, y, X, S)) # Convert the matrix into a "coda" object
end.time <- Sys.time()

time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1103    1154    1163    1166    1192    1204 
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  24.93   25.16   25.79   25.75   26.01   27.19 
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.2682  0.2682  0.2682  0.2682  0.2682  0.2682 

# Summary statistics: seconds, average ESS, average ESS per second and
# average acceptance rate for the random-walk MH sampler (row 1)
summary_tab[1, ] <- c(
  time_in_sec, mean(effectiveSize(fit_MCMC)),
  mean(effectiveSize(fit_MCMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_MCMC))
)

# Traceplot of the first two coefficients
plot(fit_MCMC[, 1:2])

MALA algorithm

# Metropolis-adjusted Langevin algorithm (MALA) for the logistic posterior.
#
# R represents the number of retained samples
# burn_in is the number of discarded samples
# epsilon is the step-size tuning parameter; S is a preconditioning matrix
# (S = I recovers plain MALA). Returns an R x p matrix of posterior draws.
MALA <- function(R, burn_in, y, X, epsilon, S) {
  p <- ncol(X)
  out <- matrix(0, R, p) # Initialize an empty matrix to store the values
  beta <- rep(0, p) # Initial values
  A <- chol(S) # Cholesky of S (upper triangular), used to draw N(0, S) noise
  S1 <- solve(S) # Inverse of S, needed in the proposal densities

  lgrad <- c(S %*% lgradient(beta, y, X)) # Preconditioned gradient at the current state
  logp <- logpost(beta, y, X)

  # Step size; the p^(-1/3) factor follows the optimal MALA scaling rate
  sigma2 <- epsilon^2 / p^(1 / 3)
  sigma <- sqrt(sigma2)

  # Main MCMC loop (burn_in + R Metropolis-Hastings steps)
  for (r in 1:(burn_in + R)) {
    # Langevin proposal: drift (sigma2/2) S grad plus N(0, sigma2 * S) noise
    beta_new <- beta + sigma2 / 2 * lgrad + sigma * c(crossprod(A, rnorm(p)))

    logpnew <- logpost(beta_new, y, X)
    lgrad_new <- c(S %*% lgradient(beta_new, y, X))

    # Deviations from the proposal means of the reverse / forward moves
    diffold <- beta - beta_new - sigma2 / 2 * lgrad_new
    diffnew <- beta_new - beta - sigma2 / 2 * lgrad

    # Log proposal densities q(beta | beta_new) and q(beta_new | beta),
    # up to a common normalizing constant that cancels in the ratio
    qold <- -diffold %*% S1 %*% diffold / (2 * sigma2)
    qnew <- -diffnew %*% S1 %*% diffnew / (2 * sigma2)

    # Metropolis-Hastings acceptance probability
    alpha <- min(1, exp(logpnew - logp + qold - qnew))
    if (runif(1) < alpha) {
      logp <- logpnew
      lgrad <- lgrad_new
      beta <- beta_new # Accept the value
    }
    # Store the values after the burn-in period
    if (r > burn_in) {
      out[r - burn_in, ] <- beta
    }
  }
  out
}
set.seed(123)

epsilon <- 0.0017 # Step size chosen after some trial and error

# Running the MCMC with identity preconditioning (S = I)
start.time <- Sys.time()
fit_MCMC <- as.mcmc(MALA(R = R, burn_in = burn_in, y, X, epsilon, S = diag(ncol(X)))) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.900   9.358  27.201  44.321  46.238 166.223 
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  180.5   728.7  1208.5  2671.3  3283.2 10343.4 
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.5638  0.5638  0.5638  0.5638  0.5638  0.5638 

# Summary statistics for plain (unpreconditioned) MALA (row 2)
summary_tab[2, ] <- c(
  time_in_sec, mean(effectiveSize(fit_MCMC)),
  mean(effectiveSize(fit_MCMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_MCMC))
)

# Traceplot of the first two coefficients
plot(fit_MCMC[, 1:2])

MALA algorithm with pre-conditioning

library(coda)
R <- 30000 # Number of retained samples
burn_in <- 5000 # Warm-up iterations
set.seed(123)

epsilon <- 1.68 # Step size chosen after some trial and error

# Covariance matrix is selected via Laplace approximation; here it is used
# as a MALA preconditioner, which markedly improves mixing
fit_logit <- glm(y ~ X - 1, family = binomial(link = "logit"))
S <- vcov(fit_logit)

# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(MALA(R = R, burn_in = burn_in, y, X, epsilon, S)) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   8583    8762    9196    9063    9312    9409 
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  3.189   3.222   3.263   3.314   3.424   3.495 
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.5686  0.5686  0.5686  0.5686  0.5686  0.5686 

# Summary statistics for the preconditioned ("tuned") MALA (row 3)
summary_tab[3, ] <- c(
  time_in_sec, mean(effectiveSize(fit_MCMC)),
  mean(effectiveSize(fit_MCMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_MCMC))
)

# Traceplot of the first two coefficients
plot(fit_MCMC[, 1:2])

Hamiltonian Monte Carlo

# Hamiltonian Monte Carlo with leapfrog integration.
#
# R, burn_in: retained / discarded iterations
# epsilon: leapfrog step size; L: leapfrog steps per trajectory
# S: preconditioning matrix — momenta are drawn from N(0, S^{-1}) so that
#    the kinetic energy is P' S P / 2
# Returns an R x p matrix of posterior draws.
HMC <- function(R, burn_in, y, X, epsilon, S, L = 10) {
  
  p <- ncol(X)
  out <- matrix(0, R, p) # Initialize an empty matrix to store the values
  beta <- rep(0, p) # Initial values
  logp <- logpost(beta, y, X) # Initial log-posterior
  S1 <- solve(S)
  A1 <- chol(S1)

  # Main MCMC loop (burn_in + R trajectories)
  for (r in 1:(burn_in + R)) {
    
    P <- c(crossprod(A1, rnorm(p))) # Auxiliary momentum variables, P ~ N(0, S^{-1})
    logK <- c(P %*% S %*% P / 2) # sum(P^2) / 2 # Kinetic energy at the beginning of the trajectory
    
    # Make a half step for momentum at the beginning
    beta_new <- beta
    Pnew <- P + epsilon * lgradient(beta_new, y, X) / 2
    
    # Alternate full steps for position and momentum
    for (l in 1:L) {
      # Make a full step for the position
      beta_new <- beta_new + epsilon * c(S %*% Pnew)
      # Make a full step for the momentum, except at end of trajectory
      if (l != L) Pnew <- Pnew + epsilon * lgradient(beta_new, y, X)
    }
    # Make a half step for momentum at the end.
    Pnew <- Pnew + epsilon * lgradient(beta_new, y, X) / 2

    # Negate momentum at end of trajectory to make the proposal symmetric
    Pnew <- - Pnew

    # Evaluate potential and kinetic energies at the end of trajectory
    logpnew <- logpost(beta_new, y, X)
    logKnew <- Pnew %*% S %*% Pnew / 2 #sum(Pnew^2) / 2
    
    # Accept or reject the state at end of trajectory, returning either
    # the position at the end of the trajectory or the initial position
    if (runif(1) < exp(logpnew - logp + logK - logKnew)) {
      logp <- logpnew
      beta <- beta_new # Accept the value
    }

    # Store the values after the burn-in period
    if (r > burn_in) {
      out[r - burn_in, ] <- beta
    }
  }
  out
}
set.seed(123)

epsilon <- 0.25 # Step size chosen after some trial and error
L <- 10 # Number of leapfrog steps per trajectory

# Covariance matrix is selected via Laplace approximation (preconditioner)
fit_logit <- glm(y ~ X - 1, family = binomial(link = "logit"))
S <- vcov(fit_logit)

# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(HMC(R = R, burn_in = burn_in, y, X, epsilon, S, L)) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 215765  222946  226610  225565  228360  233334 
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.1286  0.1314  0.1324  0.1331  0.1346  0.1390 
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.9892  0.9892  0.9892  0.9892  0.9892  0.9892 

# Summary statistics for the hand-written HMC sampler (row 4)
summary_tab[4, ] <- c(
  time_in_sec, mean(effectiveSize(fit_MCMC)),
  mean(effectiveSize(fit_MCMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_MCMC))
)

# Traceplot of the first two coefficients
plot(fit_MCMC[, 1:2])

Hamiltonian Monte Carlo (Stan)

library(rstan)

# I am not counting the compilation time
stan_compiled <- stan_model(file = "logistic.stan") # Compile the Stan program once
set.seed(1234)
# Running the MCMC (Stan's adaptive HMC — NUTS by default)
start.time <- Sys.time()
fit_HMC <- sampling(
  stan_compiled, # The stan file has been previously compiled
  data = list(X = X, y = y, n = nrow(X), p = ncol(X)), # named list of data
  chains = 1, # number of Markov chains
  warmup = burn_in, # Burn-in iterations per chain
  iter = R + burn_in # Total number of iterations per chain
)

SAMPLING FOR MODEL 'logistic' NOW (CHAIN 1).
Chain 1: 
Chain 1: Gradient evaluation took 9.6e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1: 
Chain 1: 
Chain 1: Iteration:     1 / 35000 [  0%]  (Warmup)
Chain 1: Iteration:  3500 / 35000 [ 10%]  (Warmup)
Chain 1: Iteration:  5001 / 35000 [ 14%]  (Sampling)
Chain 1: Iteration:  8500 / 35000 [ 24%]  (Sampling)
Chain 1: Iteration: 12000 / 35000 [ 34%]  (Sampling)
Chain 1: Iteration: 15500 / 35000 [ 44%]  (Sampling)
Chain 1: Iteration: 19000 / 35000 [ 54%]  (Sampling)
Chain 1: Iteration: 22500 / 35000 [ 64%]  (Sampling)
Chain 1: Iteration: 26000 / 35000 [ 74%]  (Sampling)
Chain 1: Iteration: 29500 / 35000 [ 84%]  (Sampling)
Chain 1: Iteration: 33000 / 35000 [ 94%]  (Sampling)
Chain 1: Iteration: 35000 / 35000 [100%]  (Sampling)
Chain 1: 
Chain 1:  Elapsed Time: 10.9396 seconds (Warm-up)
Chain 1:                67.1344 seconds (Sampling)
Chain 1:                78.0739 seconds (Total)
Chain 1: 
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

fit_HMC <- as.mcmc(extract(fit_HMC)$beta) # Extract the beta draws as a "coda" object

# Diagnostic
summary(effectiveSize(fit_HMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  28917   30000   30000   29865   30000   30000 
summary(R / effectiveSize(fit_HMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.000   1.000   1.005   1.000   1.037 
summary(1 - rejectionRate(fit_HMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1       1       1       1       1       1 

# Summary statistics for Stan (row 5)
summary_tab[5, ] <- c(
  time_in_sec, mean(effectiveSize(fit_HMC)),
  mean(effectiveSize(fit_HMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_HMC))
)

# Traceplot of the first two coefficients
plot(fit_HMC[, 1:2])

Pólya-gamma data-augmentation

library(BayesLogit)

# Gibbs sampler for Bayesian logistic regression via Pólya-gamma data
# augmentation. Prior: beta ~ N(b, B).
#
# R, burn_in: retained / discarded iterations
# y, X: binary response vector and design matrix
# B, b: prior covariance matrix and prior mean
# Returns an R x p matrix of posterior draws of beta.
logit_Gibbs <- function(R, burn_in, y, X, B, b) {
  p <- ncol(X)
  n <- nrow(X)
  out <- matrix(0, R, p) # Initialize an empty matrix to store the values

  P <- solve(B) # Prior precision matrix
  Pb <- P %*% b # Term appearing in the Gibbs sampling

  # X'(y - 1/2) is constant across iterations, so compute it once
  Xy <- crossprod(X, y - 1 / 2)

  # Initialization
  beta <- rep(0, p)

  # Iterative procedure
  for (r in 1:(R + burn_in)) {

    # Sampling the Pólya-gamma latent variables, omega_i | beta ~ PG(1, eta_i)
    eta <- c(X %*% beta)
    omega <- rpg.devroye(num = n, h = 1, z = eta)

    # Sampling beta | omega ~ N(mu, Sigma) with
    # Sigma = (X' Omega X + P)^{-1} and mu = Sigma (X'(y - 1/2) + P b);
    # the eigendecomposition yields both Sigma and a square root of it
    eig <- eigen(crossprod(X * sqrt(omega)) + P, symmetric = TRUE)

    Sigma <- crossprod(t(eig$vectors) / sqrt(eig$values))
    mu <- Sigma %*% (Xy + Pb)

    # A1 satisfies t(A1) %*% A1 = Sigma, so z %*% A1 has covariance Sigma
    A1 <- t(eig$vectors) / sqrt(eig$values)
    beta <- mu + c(matrix(rnorm(1 * p), 1, p) %*% A1)

    # Store the values after the burn-in period
    if (r > burn_in) {
      out[r - burn_in, ] <- beta
    }
  }
  out
}
B <- diag(100, ncol(X)) # Prior covariance matrix (beta ~ N(b, B) = N(0, 100 I_p))
b <- rep(0, ncol(X)) # Prior mean
set.seed(123)

# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(logit_Gibbs(R, burn_in, y, X, B, b)) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))

# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10018   13592   15411   15182   17369   18900 
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.587   1.728   1.950   2.051   2.209   2.995 
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      1       1       1       1       1       1 

# Summary statistics for the Pólya-gamma Gibbs sampler (row 6)
summary_tab[6, ] <- c(
  time_in_sec, mean(effectiveSize(fit_MCMC)),
  mean(effectiveSize(fit_MCMC)) / time_in_sec,
  1 - mean(rejectionRate(fit_MCMC))
)


# Traceplot of the first two coefficients
plot(fit_MCMC[, 1:2])

Results

Seconds Average ESS Average ESS per sec Average acceptance rate
MH Laplace + Rcpp 0.561887 1165.75640 2074.71670 0.2682089
MALA 2.743667 44.32131 16.15404 0.5637855
MALA tuned 2.624531 9063.31983 3453.31022 0.5686190
HMC 14.735173 225565.16881 15307.94169 0.9892330
STAN 78.846092 29864.58683 378.77067 1.0000000
Pólya-Gamma 10.254378 15181.91528 1480.53009 1.0000000